* Turn off output paging so the do-file runs through without pausing.
set more off

* Load the raw CSV as UTF-8, replacing whatever data are currently in memory.
import delimited using "1. dataguide.csv", clear encoding(UTF-8)

* Give the identifier columns the mixed-case names used in the rest of the script.
rename (symbol name engname businum corpnum) ///
       (Symbol Name EngName BusiNum CorpNum)

* Convert CorpNum to numeric in place, ignoring hyphen characters.
destring CorpNum, replace ignore("-")

* Clean up addresses.
* Re-attach a metropolitan suffix that is separated from the preceding text by
* a space, so that the name and its suffix form a single word.
foreach suffix in "특별시" "광역시" {
    replace location = subinstr(location, " `suffix'", "`suffix'", 30)
}

* loc1 and loc2 hold the first and second words of the address.
generate loc1 = word(location, 1)
generate loc2 = word(location, 2)

* Expand short region names found in loc1 to their long forms.
* The list is a sequence of (short form, long form) pairs and is applied in
* the order written; the raw address in location is left untouched.
local region_map ///
    서울시 서울특별시      서울 서울특별시 ///
    부산시 부산광역시      부산 부산광역시 ///
    인천시 인천광역시      인천 인천광역시 ///
    대구시 대구광역시      대구 대구광역시 ///
    대전시 대전광역시      대전 대전광역시 ///
    광주시 광주광역시      광주 광주광역시 ///
    울산시 울산광역시      울산 울산광역시 ///
    경기 경기도            강원 강원도 ///
    충남 충청남도          충북 충청북도 ///
    전남 전라남도          전북 전라북도 ///
    경남 경상남도          경북 경상북도 ///
    제주도 제주특별자치도  제주 제주특별자치도 ///
    세종시 세종특별자치시  세종 세종특별자치시

local n_tokens : word count `region_map'
forvalues i = 1(2)`n_tokens' {
    local j = `i' + 1
    local short : word `i' of `region_map'
    local full  : word `j' of `region_map'
    * subinword only replaces whole words, so an already expanded name is
    * not expanded a second time by the shorter pattern that follows it.
    replace loc1 = subinword(loc1, "`short'", "`full'", 30)
}


* Clean up Korean company names.
* The result is Name with legal-form markers, punctuation and spaces removed,
* so that differently formatted spellings of one name compare equal.

* Upper-case the name (upper() changes ASCII letters only).
replace Name = upper(Name)

* Remove the spelled-out legal form. A missing count (.) in subinstr()
* replaces every occurrence instead of stopping at an arbitrary limit.
replace Name = subinstr(Name, "주식회사", "", .)

* Remove abbreviated legal-form markers in round or square brackets,
* including variants where only one of the two brackets is present.
* The complete form is removed first so the half-open patterns only catch
* leftovers.
foreach k in 주 유 재 합 {
    replace Name = subinstr(Name, "(`k')", "", .)
    replace Name = subinstr(Name, "(`k'",  "", .)
    replace Name = subinstr(Name, "`k')",  "", .)
    replace Name = subinstr(Name, "[`k']", "", .)
    replace Name = subinstr(Name, "`k']",  "", .)
    replace Name = subinstr(Name, "[`k'",  "", .)
}

* Remove ASCII punctuation and spaces. The characters are given as ASCII
* codes through char() because several of them are special to the Stata
* parser when typed inside a string literal: the backtick (96) starts a local
* macro reference and the dollar sign (36) starts a global macro reference.
* Codes, in order: apostrophe, semicolon, caret, less-than, period, backtick,
* underscore, greater-than, exclamation mark, plus, question mark, left
* parenthesis, left brace, backslash, right parenthesis, dollar sign, right
* brace, vertical bar, comma, percent, left bracket, asterisk, right bracket,
* slash, at sign, colon, tilde, hash, hyphen, space.
* NOTE(review): the earlier version also removed a pair of apostrophes after
* every single apostrophe had already been removed, which could never match.
* If a double-quote character was intended, add code 34 to the list - confirm.
foreach code of numlist 39 59 94 60 46 96 95 62 33 43 63 40 123 92 41 ///
                        36 125 124 44 37 91 42 93 47 64 58 126 35 45 32 {
    replace Name = subinstr(Name, char(`code'), "", .)
}

* Remove two stray non-ASCII characters.
* NOTE(review): these look like encoding residue - confirm against the raw data.
replace Name = subinstr(Name, "â", "", .)
replace Name = subinstr(Name, "Ƣ", "", .)

* Flag records whose business registration number, corporate registration
* number and founding date all match, then remove the duplicates.
* temp is a constant column; it is kept because it is saved in dg_temp.dta.
generate temp = 1

* dup_num = number of other records sharing the same three keys.
* Records are ordered by Symbol inside each group before duplicates are dropped.
* NOTE(review): records in which all three keys are missing also form one
* group and are reduced to a single record - confirm this is intended.
bysort BusiNum CorpNum fdate (Symbol): generate dup_num = _N - 1
duplicates drop BusiNum CorpNum fdate, force

* Flag records whose cleaned Korean name and address words match, then remove
* the duplicates.
* dup_namloc = number of other records sharing the same name, loc1 and loc2.
* Records are ordered by fdate inside each group before duplicates are dropped.
bysort Name loc1 loc2 (fdate): generate dup_namloc = _N - 1
duplicates drop Name loc1 loc2, force

* Store the de-duplicated company list.
save dg_temp.dta, replace

* Dictionary: one row per CorpNum, with the Name / EngName / Symbol of every
* record sharing that CorpNum spread into numbered columns (Name1, Name2, ...).
use dg_temp.dta, clear

* Records without a corporate registration number cannot be keyed.
drop if CorpNum == .
keep Symbol Name EngName CorpNum

* Number the records within each CorpNum. Ordering by every remaining field
* makes the numbering reproducible: a plain sort on CorpNum leaves the order
* of tied records arbitrary, so which record became Name1 or Name2 could
* change from run to run. The full variable name is spelled out so the script
* does not depend on variable abbreviation being enabled.
bysort CorpNum (Symbol Name EngName): generate dup = _n

reshape wide Name EngName Symbol, i(CorpNum) j(dup)

save dg_dict, replace


 
